# First prep 2017-19 electric data
years = 2017:2019
quarters = 1:4
type = "Electric"
pge_elec = NULL
for(year in years) {
for(quarter in quarters) {
filename = paste0("PGE_", year, "_Q", quarter, "_", type, "UsageByZip.csv")
temp = read_csv(filename)
pge_elec = rbind(pge_elec, temp)
saveRDS(pge_elec, "pge_elec.rds")
}
}
# Add 2020 data manually
pge_elec_20_q1 = read_csv("PGE_2020_Q1_ElectricUsageByZip.csv")
pge_elec_20_q2 = read_csv("PGE_2020_Q2_ElectricUsageByZip.csv")
pge_elec = rbind(pge_elec, pge_elec_20_q1)
pge_elec = rbind(pge_elec, pge_elec_20_q2)
# Convert kWhs to kBTUs (1 kWh = 3412.14 BTUs = 3.412 kBTUs)
# Drop TOTALKWH and AVERAGEKWH
# Filter for commercial and residential electricity
pge_elec = pge_elec %>%
mutate(TOTALKBTU = TOTALKWH*3.412) %>%
select(!c(TOTALKWH, AVERAGEKWH)) %>%
filter(CUSTOMERCLASS %in% c("Elec- Residential", "Elec- Commercial"))
# Then prep 2017-19 gas data
years = 2017:2019
quarters = 1:4
type = "Gas"
pge_gas = NULL
for(year in years) {
for(quarter in quarters) {
filename = paste0("PGE_", year, "_Q", quarter, "_", type, "UsageByZip.csv")
temp = read_csv(filename)
pge_gas = rbind(pge_gas, temp)
saveRDS(pge_gas, "pge_gas.rds")
}
}
# Add 2020 data manually
pge_gas_20_q1 = read_csv("PGE_2020_Q1_GasUsageByZip.csv")
pge_gas_20_q2 = read_csv("PGE_2020_Q2_GasUsageByZip.csv")
pge_gas = rbind(pge_gas, pge_gas_20_q1)
pge_gas = rbind(pge_gas, pge_gas_20_q2)
# Convert therms to kBTUs (1 therm = 100,000 BTUs = 100 kBTUs)
# Drop TOTALTHM and AVERAGETHM
# Filter for commercial and residential gas
pge_gas = pge_gas %>%
mutate(TOTALKBTU = TOTALTHM*100) %>%
select(!c(TOTALTHM, AVERAGETHM)) %>%
filter(CUSTOMERCLASS %in% c("Gas- Residential", "Gas- Commercial"))
# Combine datasets
# Add column for average kBTUs per customer
# Add column for month and year combined (assume 1st day of month)
pge_all = NULL
pge_all = rbind(pge_all, pge_elec)
pge_all = rbind(pge_all, pge_gas)
pge_all = mutate(pge_all, AVERAGEKBTU = TOTALKBTU/TOTALCUSTOMERS)
pge_all$DATE = as.yearmon(paste(pge_all$YEAR, pge_all$MONTH), "%Y %m")
# Drop duplicate entries (there are duplicate entries in September 2017)
# Note that there is only one customer of each type in each zip code for each month, so aggregated together, the zip code, month, year, and type serve as a kind of unique ID.
pge_all = pge_all[!duplicated(pge_all[c(1,2,3,4)]),]
# Show dataset
pge_all
## # A tibble: 119,850 x 9
## ZIPCODE MONTH YEAR CUSTOMERCLASS COMBINED TOTALCUSTOMERS TOTALKBTU
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 93101 1 2017 Elec- Commer… Y 0 0
## 2 93101 2 2017 Elec- Commer… Y 0 0
## 3 93101 3 2017 Elec- Commer… Y 0 0
## 4 93105 1 2017 Elec- Commer… Y 0 0
## 5 93105 2 2017 Elec- Commer… Y 0 0
## 6 93105 3 2017 Elec- Commer… Y 0 0
## 7 93110 1 2017 Elec- Commer… Y 0 0
## 8 93110 2 2017 Elec- Commer… Y 0 0
## 9 93110 3 2017 Elec- Commer… Y 0 0
## 10 93117 1 2017 Elec- Commer… Y 0 0
## # … with 119,840 more rows, and 2 more variables: AVERAGEKBTU <dbl>,
## # DATE <yearmon>
# Getting list of Bay Area zip codes
ca_counties <- counties("CA", cb = T, progress_bar = F)
bay_county_names <-
c(
"Alameda",
"Contra Costa",
"Marin",
"Napa",
"San Francisco",
"San Mateo",
"Santa Clara",
"Solano",
"Sonoma"
)
bay_counties = ca_counties %>%
filter(NAME %in% bay_county_names)
usa_zips = zctas(cb = TRUE, progress_bar = FALSE)
bay_zips = usa_zips %>%
st_centroid() %>%
.[bay_counties, ] %>%
st_set_geometry(NULL) %>%
left_join(usa_zips %>% select(GEOID10)) %>%
st_as_sf()
# Reducing dataset to Bay Area entities
pge_bay = pge_all %>%
mutate(ZIPCODE = ZIPCODE %>% as.character()) %>%
filter(ZIPCODE %in% bay_zips$GEOID10)
This chart shows monthly total kBTUs of residential and commercial electricity and gas consumption in the Bay Area from Q1 2017 to Q2 2020.
plot_ly() %>% add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Elec- Commercial"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Commercial (Electricity)") %>%
add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Gas- Commercial"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Commercial (Gas)") %>%
add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Elec- Residential"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Residential (Electricity)") %>%
add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Gas- Residential"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Residential (Gas)") %>%
layout(xaxis = list(title = "Date",
fixedrange = T),
yaxis = list(title = "kBTU",
fixedrange = T),
barmode = "stack",
legend = list(title = list(text = "Energy User"))) %>%
config(displayModeBar = F)
Visual inspection suggests that in 2020, there was a drop in commercial electricity usage in April, but commercial electricity use rebounded to slightly below pre-COVID levels by May. There was also a sharp decline in commercial gas use from March 2020 in April, and this decline continued into June 2020. The decline in commercial gas usage is expected, since many Bay Area workers transitioned to working from home, and offices had less need for using their heating and A/C systems. It is somewhat surprising that electricity use rebounded in May 2020, but my guess is that manufacturing facilities comprise the bulk of commercial electricity usage, and unlike offices, most manufacturing facilities did not close down due to COVID since they were deemed “essential.” Anecdotally, my mother works at a medical devices company, and while all office workers have been working from home since February, the factories only closed down briefly and reopened for in-person work as soon as social distancing protocols were established.
Residential electric usage increased from April to June 2020, in line with what one would expect from increased remote work. Residential gas usage was substantially higher in April 2020 than in April of previous years and slightly higher in June 2020 than previous Junes; this is also consistent with increased remote work.
plot_ly() %>% add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Elec- Commercial"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Commercial (Electricity)") %>%
add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Gas- Commercial"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Commercial (Gas)") %>%
layout(xaxis = list(title = "Date",
fixedrange = T),
yaxis = list(title = "kBTU",
fixedrange = T),
barmode = "stack",
legend = list(title = list(text = "Energy Type"))) %>%
config(displayModeBar = F)
plot_ly() %>% add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Elec- Residential"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Residential (Electricity)") %>%
add_trace(data = pge_bay %>%
filter(CUSTOMERCLASS == "Gas- Residential"),
x = ~DATE %>% factor(),
y = ~TOTALKBTU,
type = "bar",
name = "Residential (Gas)") %>%
layout(xaxis = list(title = "Date",
fixedrange = T),
yaxis = list(title = "kBTU",
fixedrange = T),
barmode = "stack",
legend = list(title = list(text = "Energy Type"))) %>%
config(displayModeBar = F)
There are many ways we can measure changes in energy consumption before and after COVID. We can look at: * the difference between February and April 2020 use; * the difference between February 2020 use and average use in April through June; * the difference between April, May, or June use in 2020 and average total use in the same month in the previous three years; * the difference between average use in April, May, and June 2020 and average use across all three months across in any previous year, among other options.
For this exercise, I will find the difference between April use in 2020 and average use in April of 2017-19 for each zip code and plot this difference for every neighborhood. I drop all zip codes where there was no electricity use recorded in any month.
# Create the new dataset
# Widen the dataset so that each year has a column for its usage values
# Because total customers may vary slightly from year to year, the rows don't collapse automatically, so collapse the rows.
# Create new column representing difference between 2020 and 2017-19 average usage
# Split steps because otherwise there is an error generating the average usage column
# Append geometry information
pge_map = pge_bay %>% filter(MONTH == "4" &
(CUSTOMERCLASS == "Elec- Commercial" |
CUSTOMERCLASS == "Elec- Residential")) %>%
select(!c(MONTH, COMBINED, DATE, TOTALCUSTOMERS)) %>%
spread(YEAR, TOTALKBTU, fill = 0) %>%
group_by(ZIPCODE) %>%
summarize(`2017` = sum(`2017`, na.rm = T),
`2018` = sum(`2018`, na.rm = T),
`2019` = sum(`2019`, na.rm = T),
`2020` = sum(`2020`, na.rm = T)) %>%
filter(!(`2017` == 0 &
`2018` == 0 &
`2019` == 0 &
`2020` == 0))
pge_map = pge_map %>% mutate(AVG = rowMeans(pge_map[,c("2017", "2018", "2019")],
na.rm = T),
DIFF = `2020` - AVG) %>%
left_join(bay_zips %>% select(GEOID10),
by = c("ZIPCODE" = "GEOID10")) %>%
st_as_sf() %>%
st_transform(4326)
# Draw map
res_pal = colorNumeric(palette = "Blues",
domain = pge_map$DIFF)
leaflet() %>% addTiles() %>%
addPolygons(data = pge_map,
fillColor = ~res_pal(DIFF),
color = "white",
opacity = 0.5,
fillOpacity = 0.5,
weight = 1,
label = ~paste0(DIFF, " kBTU change in ", ZIPCODE),
highlightOptions = highlightOptions(weight = 2,
opacity = 1)) %>%
addLegend(data = pge_map,
pal = res_pal,
values = ~DIFF,
title = "Difference in Electricity<br>Consumption between<br>April 2017-19 and 2020")
One issue that quickly becomes clear is that changes in total electricity consumption, as we did for the charts above, neglect to take into account changes in total customers. Visual inspection would suggest that there was a dramatic increase in energy consumption in zip code 94545, but in fact that increase is simply because there were about 7,000 more PG&E Electric customers in that zip code in April 2020 than in all previous Aprils. We need to show changes in use per customer instead.
# Same as above
pge_map_avg = pge_bay %>% filter(MONTH == "4" &
(CUSTOMERCLASS == "Elec- Commercial" |
CUSTOMERCLASS == "Elec- Residential")) %>%
select(!c(MONTH, COMBINED, DATE, TOTALCUSTOMERS, TOTALKBTU)) %>%
spread(YEAR, AVERAGEKBTU, fill = 0) %>%
group_by(ZIPCODE) %>%
summarize(`2017` = sum(`2017`, na.rm = T),
`2018` = sum(`2018`, na.rm = T),
`2019` = sum(`2019`, na.rm = T),
`2020` = sum(`2020`, na.rm = T)) %>%
filter(!(`2017` == 0 &
`2018` == 0 &
`2019` == 0 &
`2020` == 0))
pge_map_avg = pge_map_avg %>% mutate(AVG = rowMeans(pge_map_avg[,c("2017", "2018", "2019")],
na.rm = T),
DIFF = `2020` - AVG) %>%
left_join(bay_zips %>% select(GEOID10),
by = c("ZIPCODE" = "GEOID10")) %>%
st_as_sf() %>%
st_transform(4326)
# Draw map
res_pal = colorNumeric(palette = "Reds",
domain = pge_map_avg$DIFF,
reverse = TRUE)
leaflet() %>% addTiles() %>%
addPolygons(data = pge_map_avg,
fillColor = ~res_pal(DIFF),
color = "white",
opacity = 0.5,
fillOpacity = 0.5,
weight = 1,
label = ~paste0(DIFF, " kBTU change in ", ZIPCODE),
highlightOptions = highlightOptions(weight = 2,
opacity = 1)) %>%
addLegend(data = pge_map_avg,
pal = res_pal,
values = ~DIFF,
title = "Difference in Electricity<br>Consumption per Customer<br>between April 2017-19<br>and 2020")
Sure enough, electricity consumption per customer fell in almost every zip code. Here are some notable drops:
Fun fact: PG&E does not sell electricity in the city of Santa Clara, where I live! The city of Santa Clara runs its own electric utility service, Silicon Valley Power. This is why there is no data for the city of Santa Clara’s electric usage.
# Repeat data cleaning steps in creation of pge_map
pge_map_comm = pge_bay %>% filter(MONTH == "4" &
CUSTOMERCLASS == "Elec- Commercial") %>%
select(!c(MONTH, COMBINED, DATE, TOTALCUSTOMERS, TOTALKBTU)) %>%
spread(YEAR, AVERAGEKBTU, fill = 0) %>%
filter(!(`2017` == 0 &
`2018` == 0 &
`2019` == 0 &
`2020` == 0))
pge_map_comm = pge_map_comm %>%
mutate(AVG = rowMeans(pge_map_comm[,c("2017", "2018", "2019")],
na.rm = T),
DIFF = `2020` - AVG) %>%
left_join(bay_zips %>% select(GEOID10),
by = c("ZIPCODE" = "GEOID10")) %>%
st_as_sf() %>%
st_transform(4326)
# Draw map
res_pal = colorNumeric(palette = "Blues",
domain = pge_map_comm$DIFF,
reverse = TRUE)
leaflet() %>% addTiles() %>%
addPolygons(data = pge_map_comm,
fillColor = ~res_pal(DIFF),
color = "white",
opacity = 0.5,
fillOpacity = 0.5,
weight = 1,
label = ~paste0(DIFF, " kBTU change in ", ZIPCODE),
highlightOptions = highlightOptions(weight = 2,
opacity = 1)) %>%
addLegend(data = pge_map_comm,
pal = res_pal,
values = ~DIFF,
title = "Difference in Commercial<br>Electricity Consumption<br>per Customer<br>between April 2017-19<br>and 2020")
Commercial energy consumption fell dramatically in April 2020 compared to the same month in the past three years. The four declines noted above are reflected here.
# Repeat setup steps in creation of pge_map
pge_map_res = pge_bay %>% filter(MONTH == "4" &
CUSTOMERCLASS == "Elec- Residential") %>%
select(!c(MONTH, COMBINED, DATE, TOTALCUSTOMERS, TOTALKBTU)) %>%
spread(YEAR, AVERAGEKBTU, fill = 0) %>%
filter(!(`2017` == 0 &
`2018` == 0 &
`2019` == 0 &
`2020` == 0))
pge_map_res = pge_map_res %>%
mutate(AVG = rowMeans(pge_map_res[,c("2017", "2018", "2019")],
na.rm = T),
DIFF = `2020` - AVG) %>%
left_join(bay_zips %>% select(GEOID10),
by = c("ZIPCODE" = "GEOID10")) %>%
st_as_sf() %>%
st_transform(4326)
# Draw map
res_pal = colorNumeric(palette = "Blues",
domain = pge_map_res$DIFF)
leaflet() %>% addTiles() %>%
addPolygons(data = pge_map_res,
fillColor = ~res_pal(DIFF),
color = "white",
opacity = 0.5,
fillOpacity = 0.5,
weight = 1,
label = ~paste0(DIFF, " kBTU change in ", ZIPCODE),
highlightOptions = highlightOptions(weight = 2,
opacity = 1)) %>%
addLegend(data = pge_map_res,
pal = res_pal,
values = ~DIFF,
title = "Difference in Residential<br>Electricity Consumption<br>per Customer<br>between April 2017-19<br>and 2020")
In keeping with the increase in working from home, residential electricity consumption increased substantially across the Bay Area relative to the same period in previous years. Notable changes:
One important note about the PG&E data is that it does not always collect information from every zip code in every year. This is a major reason why I wound up averaging data in April 2017, 2018, and 2019: trying to compare a single year to another single year can lead to missing or inaccurate estimates of changes in electricity consumption.
Additionally, as briefly alluded to in the data preparation section, the September 2017 data is duplicated in its corresponding CSV. However, the duplicates are not exact (that is, duplicate entries contain slightly varying counts of customers and total electricity use), and there is no explanation for why they are different. This raises the question of why they are different and which one of each pair of duplicates an analyst should keep. I think that in these kinds of situations, to avoid biasing estimates upwards or downwards, we should randomly select which version is included in the final dataset. Although I do not know how to implement this random selection in R, I attempted to write code that was agnostic to which version wound up being included.